# Libraries
library(tidyverse)
library(ggplot2)
library(ggrepel)
library(ggpmisc)
library(broom)
library(data.table) 
library(dplyr)
library(msm)
library(plm)

sigma = 4. # Elasticity of substitution for calculating amplification 

# change to path where you saved the replication script
wkdir <- "/Users/huiyuli/Dropbox/Entry Technology/JPE macro submission/Replication Script"

########## Aggregate plots ########## 
# Import data on national real gdp, number of firms, and employment
data_gdp <- fread(paste(wkdir, "raw data/df_nationalrealgdp.csv",  sep="/"))
data_bds <- fread(paste(wkdir, "raw data/bds2020.csv",  sep="/"),  select = c("year", "firms", "emp"))
data_gdp_bds <- merge(data_gdp, data_bds, by.x = "year", by.y = "year", all.x = FALSE, all.y = TRUE)
data_gdp_bds <-mutate(data_gdp_bds, ln_GDP_worker = log(real_gdp / emp),
                      ln_emp_per_firm = log(emp / firms),
                      ln_emp = log(emp),
                      ln_firms = log(firms))

# Figure 1 (a). Regress the log firms on log emp 
eq <- lm(ln_firms ~ ln_emp, data = data_gdp_bds)
tidy_eq <- tidy(eq)
regtext <- paste(paste0('y = ', round(summary(eq)$coefficients[2, 1], digits = 2), 'x + ', round(summary(eq)$coefficients[1,1], digits = 2)),
                 paste0('SE: ', round(summary(eq)$coefficients[2,2], digits = 2)),
                 paste0('R-squared: ', round(summary(eq)$r.squared, digits = 2)),
                 sep = '\n')
regtext <- ''   # comment out to add regression results to the graph
national_firms_emp <- ggplot(data_gdp_bds, aes(x = ln_emp, y = ln_firms)) +
  geom_point() +
  stat_poly_line(se = FALSE) +
  theme(aspect.ratio=1) +
  #coord_fixed(ratio=1) +
  geom_text(data = data.frame(annotateText = regtext),
            aes(x = -Inf, y = Inf, hjust = 0, vjust = 1.25, label = annotateText), size = 5) +
  geom_text_repel(aes(label = year), max.overlaps = 75, min.segment.length=0) +
  scale_x_continuous(breaks = seq(18.0, 18.8, .2), labels = seq(18.0, 18.8, .2), limits = c(18.0, 18.8)) + 
  scale_y_continuous(breaks = seq(14.9, 15.7, .2), labels = seq(14.9, 15.7, .2), limits = c(14.9, 15.7)) +
  theme_bw() +
  #labs(caption = 'Source: BDS') +
  ylab("log Firms") +
  xlab("log Employment") + 
  theme(axis.title = element_text(size = 18))
ggsave(paste(wkdir, "output/figures/figure1a.png",  sep="/"), plot = national_firms_emp, device = "png")


# Figure 1 (b). Regress the log emp per firm on log real gdp per emp
eq <- lm( ln_emp_per_firm ~ ln_GDP_worker, data = data_gdp_bds)
tidy_eq <- tidy(eq)
regtext <- paste(paste0('y = ', round(summary(eq)$coefficients[2, 1], digits = 2), 'x + ', round(summary(eq)$coefficients[1,1], digits = 2)),
                 paste0('SE: ', round(summary(eq)$coefficients[2,2], digits = 2)),
                 paste0('R-squared: ', round(summary(eq)$r.squared, digits = 2)),
                 sep = '\n')
regtext <- ''   # comment out to add regression results to the graph
national_firms_prod  <- ggplot(data_gdp_bds, aes(x = ln_GDP_worker, y = ln_emp_per_firm)) +
  geom_point() +
  stat_poly_line(se = FALSE) +
  #coord_fixed(ratio=1)+
  theme(aspect.ratio=1) +
  geom_text(data = data.frame(annotateText = regtext),
            aes(x = -Inf, y = Inf, hjust = 0, vjust = 1.25, label = annotateText), size = 5) +
  geom_text_repel(aes(label = year), max.overlaps = 75, min.segment.length=0) +
  scale_x_continuous(breaks = seq(-9.4, -8.8, .1), labels = seq(-9.4, -8.8, .1), limits = c(-9.4, -8.8)) + 
  scale_y_continuous(breaks = seq(2.7, 3.3, .1), labels = seq(2.7, 3.3, .1), limits = c(2.7, 3.3)) +
  theme_bw() +
  #labs(caption = 'Source: BDS') +
  ylab("log Employment per firm") +
  xlab("log GDP per worker") + 
  theme(axis.title = element_text(size = 18))
ggsave(paste(wkdir, "output/figures/figure1b.png", sep="/"), plot = national_firms_prod, device = "png")


########## Table 1 regression results ########## 
data_gdp <- fread(paste(wkdir, "raw data/df_nationalrealgdp.csv",  sep="/"))
data_bds <- fread(paste(wkdir, "raw data/bds2020.csv",  sep="/"), select = c("year", "firms", "emp"))
data_gdp_bds <- merge(data_bds, data_gdp, by.x = "year", by.y = "year", all.x = TRUE, all.y = FALSE)
data_gdp_bds <-mutate(data_gdp_bds, ln_GDP_worker = log(real_gdp / emp),
                      ln_emp_per_firm = log(emp / firms),
                      ln_emp = log(emp),
                      ln_firms = log(firms))
data_gdp_bds$ln_firms_lagged <- dplyr::lag(data_gdp_bds$ln_firms, n=1, order_by=data_gdp_bds$year)
data_bds_entrant <- fread(paste(wkdir, "raw data/bds2020_fa.csv",  sep="/"),  
                          select = c("year", "fage", "firms", "emp"))[grep('a) 0', fage)][, c("year", "firms", "emp")]
data_bds_entrant <- as.data.frame(sapply(data_bds_entrant, as.numeric)) 
data_bds_entrant <- mutate(data_bds_entrant, ln_emp_per_firm_entrant = log(emp / firms))
data_gdp_bds <- merge(data_gdp_bds, data_bds_entrant[, c("year", "ln_emp_per_firm_entrant")], by.x = "year", by.y = "year", all.x = TRUE, all.y = FALSE)


# Col 1
eq_col <- lm( ln_emp_per_firm ~ ln_GDP_worker, data = data_gdp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["ln_GDP_worker", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["ln_GDP_worker", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col1 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 2
eq_col <- lm(ln_emp_per_firm ~ ln_GDP_worker + ln_firms_lagged, data = data_gdp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["ln_GDP_worker", "Estimate"]
coef_firm = summary(eq_col)$coefficients["ln_firms_lagged", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["ln_GDP_worker", "Std. Error"], digits = 3)
phi = round(-coef_firm, digits= 3)
se_phi = round(summary(eq_col)$coefficients["ln_firms_lagged", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_firm)/(sigma-1+coef_prod/(1+coef_firm))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col2 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 3
eq_col <- lm( ln_emp_per_firm_entrant ~ ln_GDP_worker, data = data_gdp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["ln_GDP_worker", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["ln_GDP_worker", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col3 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 4
eq_col <- lm(ln_emp_per_firm_entrant ~ ln_GDP_worker + ln_firms_lagged, data = data_gdp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["ln_GDP_worker", "Estimate"]
coef_firm = summary(eq_col)$coefficients["ln_firms_lagged", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["ln_GDP_worker", "Std. Error"], digits = 3)
phi = round(-coef_firm, digits= 3)
se_phi = round(summary(eq_col)$coefficients["ln_firms_lagged", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_firm)/(sigma-1+coef_prod/(1+coef_firm))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col4 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

table <- cbind(table.col1, table.col2, table.col3,table.col4)
table <- round(table, digits=3)
rnames <- c("lambdar", "se", "phi", "se", "R2", "N", "amp", "se")
cnames <- c("All firms","All firms lagged N","New firms","New firms lagged N")
table1 <- matrix(table,nrow=8,byrow=FALSE,dimnames=list(rnames,cnames))
View(table1)

########## Table A2 regression results ########## 
data_gdp <- fread(paste(wkdir, "raw data/df_nationalrealgdp.csv",  sep="/"))
data_bds <- fread(paste(wkdir, "raw data/bds2020.csv",  sep="/"), select = c("year", "estabs", "emp"))
data_gdp_bds <- merge(data_bds, data_gdp, by.x = "year", by.y = "year", all.x = TRUE, all.y = FALSE)
data_gdp_bds <-mutate(data_gdp_bds, ln_GDP_worker = log(real_gdp / emp),
                      ln_emp_per_estab = log(emp / estabs),
                      ln_emp = log(emp),
                      ln_estabs = log(estabs))
data_gdp_bds$ln_estabs_lagged <- dplyr::lag(data_gdp_bds$ln_estabs, n=1)
data_bds_entrant <- fread(paste(wkdir, "raw data/bds2020_ea.csv",  sep="/"),  
                          select = c("year", "eage", "estabs", "emp"))[grep('a) 0', eage)][, c("year", "estabs", "emp")]
data_bds_entrant <- as.data.frame(sapply(data_bds_entrant, as.numeric)) 
data_bds_entrant <- mutate(data_bds_entrant, ln_emp_per_estab_entrant = log(emp / estabs))
data_gdp_bds <- merge(data_gdp_bds, data_bds_entrant[, c("year", "ln_emp_per_estab_entrant")], by.x = "year", by.y = "year", all.x = TRUE, all.y = FALSE)

sigma = 4


# Col 1
eq_col <- lm( ln_emp_per_estab ~ ln_GDP_worker, data = data_gdp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["ln_GDP_worker", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["ln_GDP_worker", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col1 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 2
eq_col <- lm(ln_emp_per_estab ~ ln_GDP_worker + ln_estabs_lagged, data = data_gdp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["ln_GDP_worker", "Estimate"]
coef_estab = summary(eq_col)$coefficients["ln_estabs_lagged", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["ln_GDP_worker", "Std. Error"], digits = 3)
phi = round(-coef_estab, digits= 3)
se_phi = round(summary(eq_col)$coefficients["ln_estabs_lagged", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_estab)/(sigma-1+coef_prod/(1+coef_estab))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col2 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 3
eq_col <- lm( ln_emp_per_estab_entrant ~ ln_GDP_worker, data = data_gdp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["ln_GDP_worker", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["ln_GDP_worker", "Std. Error"], digits = 3)
phi = 0
se_phi = 0
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(sigma-1+coef_prod)*100
amp_se = deltamethod(~ -x2/(sigma-1+x2)*100, coef(eq_col), vcov(eq_col))
table.col3 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)

# Col 4
eq_col <- lm(ln_emp_per_estab_entrant ~ ln_GDP_worker + ln_estabs_lagged, data = data_gdp_bds)
summary(eq_col)
coef_prod = summary(eq_col)$coefficients["ln_GDP_worker", "Estimate"]
coef_estab = summary(eq_col)$coefficients["ln_estabs_lagged", "Estimate"]
lambda = round(coef_prod+1, digits= 3)
se_lambda = round(summary(eq_col)$coefficients["ln_GDP_worker", "Std. Error"], digits = 3)
phi = round(-coef_estab, digits= 3)
se_phi = round(summary(eq_col)$coefficients["ln_estabs_lagged", "Std. Error"], digits = 3)
r2 = round(summary(eq_col)$r.squared, digits = 3)
amp = -coef_prod/(1+coef_estab)/(sigma-1+coef_prod/(1+coef_estab))*100
amp_se = deltamethod(~ -x2/(1+x3)/(sigma-1+x2/(1+x3))*100, coef(eq_col), vcov(eq_col))
table.col4 <- c(lambda,se_lambda,phi,se_phi,r2,nobs(eq_col),amp,amp_se)
table <- cbind(table.col1, table.col2, table.col3,table.col4)
table <- round(table, digits=3)

rnames <- c("lambdar", "se", "phi", "se", "R2", "N", "amp", "se")
cnames <- c("All estabs","All estabs lagged N","New estabs","New estabs lagged N")
tableA2 <- matrix(table,nrow=8,byrow=FALSE,dimnames=list(rnames,cnames))
View(tableA2)

# save regression results
write.csv(table1,file=paste(wkdir, "output/tables/table1.csv",  sep="/"), row.names=TRUE)
write.csv(tableA2,file=paste(wkdir, "output/tables/tableA2.csv",  sep="/"), row.names=TRUE)